\documentclass[preprint,12pt]{elsarticle}

\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{natbib}
\usepackage{array}
\usepackage{tabularx}

\newcolumntype{L}{>{\raggedright\arraybackslash}X}

\journal{Energy Policy}

\begin{document}

\begin{frontmatter}

\title{Do low-carbon city pilots deliver cleaner air? Evidence of delayed gains concentrated in industrial cities}

\author[aff1,aff2]{Zhilong Zhao\corref{cor1}}
\ead{zhilongzhao@scut.edu.cn}

\affiliation[aff1]{organization={School of Journalism and Communication, South China University of Technology}, addressline={Guangzhou}, country={China}}
\affiliation[aff2]{organization={Guangdong--Hong Kong--Macao Greater Bay Area Research Institute of International Communication, South China University of Technology}, addressline={Guangzhou}, country={China}}

\cortext[cor1]{Corresponding author: Zhilong Zhao}

\begin{abstract}
City-level low-carbon pilots are widely used to accelerate energy transition and industrial decarbonisation, yet evidence on local air-quality co-benefits and their distribution across cities remains mixed. This paper estimates the impact of China's Low-carbon City Pilot (LCCP) program on urban air quality using a prefecture-level city-by-year panel spanning 2001--2023. We measure air quality as the annual share of ``good'' days (official grades ``excellent'' or ``good'') from daily AQI records and restrict the sample to city-years with at least 330 monitored days to avoid partial-year measurement artefacts.

Exploiting staggered adoption across cohorts, we compare cities to themselves over time and to contemporaneous non-pilot cities, and we trace outcomes around adoption to assess pre-existing trends. We estimate an average post-adoption increase in the good-day share of about 0.03 (three percentage points). Improvements emerge with a lag of roughly three years, and event-time diagnostics do not indicate strong differential pre-trends in the baseline specification.

The gains are highly heterogeneous. Cities with higher baseline secondary-industry shares exhibit substantially larger estimated improvements (around five percentage points), whereas low-secondary cities show estimates close to zero. Overall, the evidence is consistent with meaningful local air-quality co-benefits of low-carbon pilot governance, concentrated where industrial pollution exposure and abatement potential are greatest. For policy, this suggests that the near-term air-quality returns to low-carbon pilots may be highest in industrial cities and may depend on implementation capacity---monitoring, enforcement, and upgrading support.
\end{abstract}

\begin{keyword}
low-carbon policy \sep air quality \sep staggered adoption \sep event study \sep industrial structure \sep China
\end{keyword}

\end{frontmatter}

\section{Introduction}

Decarbonising cities is central to energy transition because urban areas concentrate energy use, industrial production, and local environmental externalities. Many countries use pilot-style governance to translate national targets into local action, but pilot packages are often complex, heterogeneous, and difficult to evaluate. China’s Low-carbon City Pilot (LCCP) program is a leading example: it aims to reduce carbon intensity, promote clean energy and energy efficiency, and accelerate industrial upgrading through a bundle of targets, monitoring, and policy experimentation.

This paper evaluates an immediate and policy-salient outcome: air-quality co-benefits. While the primary objective of LCCP is low-carbon transition, local implementation is frequently motivated by near-term welfare gains from cleaner air. Existing evaluations focus predominantly on carbon intensity, energy efficiency, and green innovation \citep{Zeng2023,Wang2023,Ma2021}, whereas evidence on air-quality outcomes is more mixed and varies across samples and outcomes \citep{Yan2021,Zhang2022,Gu2023AirPollution}. Two policy questions therefore remain open: (i) is LCCP associated with measurable air-quality improvements in a long panel that spans early and recent cohorts; and (ii) which cities benefit the most?

We address these questions with a city-year panel for 2001--2023 constructed from daily AQI records, city-level pilot adoption, and socioeconomic controls. Our main outcome is the annual share of ``good'' days (air-quality grades ``excellent'' or ``good''), computed as \texttt{good\_days/days}. To reduce partial-year artefacts from monitoring start/stop, we restrict to city-years with at least 330 monitored days. The resulting baseline sample includes 294 cities and 3,765 city-year observations.

Because LCCP adoption is staggered across cohorts, ``before vs.\ after'' comparisons can be sensitive to timing and selection. We therefore estimate dynamic effects around adoption and report pre-trend diagnostics; technical details and robustness checks are provided in Section~4 and Appendix~A.

Substantively, we focus on baseline industrial exposure as a policy-relevant heterogeneity dimension. If low-carbon pilots operate through enforcement, upgrading, and industrial restructuring, then air-quality benefits should be larger where industrial pollution loads and abatement opportunities are greater.

The results deliver a simple, policy-facing message. In the quality-controlled sample, we estimate an average post-adoption increase in the annual good-day share of about 0.03 (three percentage points), with improvements emerging after roughly three years. The gains are concentrated in industrial cities: estimates are close to zero in low-secondary cities but around five percentage points in high-secondary cities. By contrast, carbon-trading pilot estimates in the same data exhibit strong pre-trends, underscoring the importance of dynamic diagnostics in pilot evaluations \citep{AlmondZhang2021CarbonTradingAirQuality,Weng2022}.

These findings speak to two broader debates. First, policy packages and policy mixes can generate local co-benefits even when the primary objective is carbon mitigation \citep{Li2022,Yang2023_3137,Wu2023}. Second, scaling is not only about whether a pilot ``works'' on average: it is also about where it works and what implementation capacity is needed for benefits to materialise.

\section{Policy background and related literature}

\subsection{The Low-carbon City Pilot (LCCP) program}

LCCP is implemented in multiple cohorts across prefecture-level cities. It is designed as a policy package rather than a single instrument: local governments are expected to set low-carbon targets, improve monitoring and reporting, promote energy-efficiency upgrades, guide industrial restructuring, and coordinate investment in low-carbon infrastructure. As a pilot program, LCCP emphasises experimentation and learning-by-doing, while embedding cities into a monitoring and accountability framework \citep{Guo2022}.

The policy timeline matters for both mechanisms and evaluation. The first LCCP cohort started in 2010, followed by expansions in 2012 and 2017. These waves align with broader national shifts toward binding energy and environmental targets and the use of pilots to experiment with governance and technology pathways \citep{Wang2015,Dienst2013}. At the local level, implementation is typically multi-year: planning, financing, and enforcing upgrading programs can precede observable environmental changes. This motivates our emphasis on dynamic (event-time) effects rather than only contemporaneous post indicators.

From an energy-policy perspective, the most plausible co-benefit channels operate through energy use and industrial processes. Cities can improve air quality by reducing coal dependence, improving energy efficiency, and shifting toward cleaner production. A key implication is that co-benefits may be largest when LCCP accelerates energy-efficiency improvement and energy substitution in high-emitting sectors \citep{Weng2017}. In practice, LCCP action plans often combine ``hard'' instruments (e.g., standards, enforcement, and investment in infrastructure) with ``soft'' instruments (e.g., information, reporting, and performance evaluation). This package nature also implies that impacts may be heterogeneous and delayed rather than immediate.

Empirical work evaluates LCCP along multiple dimensions. One strand links LCCP to carbon-related outcomes such as carbon intensity and carbon abatement \citep{Zeng2023,Liu2022,Liu2022_0188} and to energy efficiency and energy--environmental efficiency \citep{Wang2023,Yang2023,Li2025}.

A second strand focuses on innovation and firm responses, including green innovation and upgrading of high-emitting enterprises \citep{Ma2021,Chen2022,Liu2023,Wang2022_9002}.

A third strand studies broader urban outcomes, including productivity/green development, industrial upgrading, and spatial spillovers \citep{Chen2021,Zhong2024,Li2024_4830}. Taken together, these findings suggest that LCCP is consequential, but they also indicate heterogeneity by local conditions and implementation capacity.

\subsection{Overlap with other pilot policies}

China's low-carbon governance relies on multiple pilots that can overlap spatially and temporally. The carbon emissions trading scheme (ETS) pilots, for example, were launched between 2013 and 2016, and a growing literature examines ETS impacts on air pollution and coordinated carbon--pollution outcomes \citep{Weng2022,Shi2022,Liu2021}.

Other pilots---such as green finance reform zones, innovative-city pilots, smart-city pilots, and energy-quota trading---also target innovation, efficiency, and emissions \citep{Zhang2023,Gao2024,Du2023}. This policy environment strengthens the case for careful evaluation diagnostics: if pilot placement is correlated with pre-existing trends or anticipatory actions, pooled estimates can be misleading even when point estimates appear plausible.

\subsection{Pilot governance, implementation, and policy mixes}

Energy-transition policies often operate through \emph{policy mixes}: bundles of regulatory, market-based, and informational instruments that jointly shape incentives and constraints \citep{Wu2023}. LCCP fits this logic. It combines quantified targets, monitoring and reporting, and a mandate for local experimentation. In practice, cities frequently pair the pilot with complementary actions such as energy-efficiency retrofits, industrial upgrading plans, clean-energy infrastructure investment, and green finance initiatives. Empirical evidence from China’s pilot landscape supports the view that different pilot instruments can reinforce one another or operate through related channels. For example, green finance reforms are linked to technological innovation and energy efficiency \citep{Zhang2023,Gao2024}, and carbon trading pilots are studied in relation to coordinated carbon and air-pollution reductions \citep{Shi2022,Li2024}.

This ``package + mix'' perspective has two implications for evaluation and policy learning. First, program impacts can be delayed: implementation requires time for planning, compliance systems, and investment. Second, average effects can be heterogeneous because local capacity, industrial base, and complementary policies differ. In the LCCP context, implementation salience may be reflected in governance and information channels, including the use of digital tools and communication. For example, city-level studies examine how pilots interact with the digital economy, innovation, and enterprise behavior \citep{Wang2023_0339,Wang2022_9002}. These considerations motivate our emphasis on dynamic patterns and heterogeneity by baseline industrial exposure.

Pilot governance also creates an opportunity for structured policy learning. Ideally, pilot programs generate information about what works, where, and under what conditions; this information can then be used to revise program rules and to scale the most effective packages. Case-based and implementation-focused research highlights that pilots can differ markedly in local design and execution \citep{Dienst2013,Wang2015,Guo2022}. For evaluation, this reinforces two points: measured outcomes should be tied to clear implementation benchmarks, and heterogeneous effects should be interpreted as informative signals for targeting and capacity building rather than as statistical noise.

\subsection{Air-quality co-benefits and heterogeneity}

Theoretical and policy arguments for co-benefits emphasize that measures reducing fossil energy use and improving efficiency often also reduce local air pollution \citep{Chen2021_8008,Li2022}. However, the co-benefit magnitude depends on baseline sources, local industrial composition, enforcement capacity, and the speed of upgrading. Empirical studies report that LCCP can reduce haze/PM$_{2.5}$ or improve air quality, though estimates differ across outcomes, time windows, and identification choices \citep{Yan2021,Zhang2022,He2023,Gu2023AirPollution}.

Related work evaluates coordinated reductions of carbon and air pollution under other policy instruments and policy mixes, including carbon trading and other regulatory packages \citep{Shi2022,Shao2023,Li2024}.

We focus on baseline industrial exposure as an economically interpretable heterogeneity dimension. Industrial cities typically have higher local pollution loads and larger abatement opportunities from process upgrading and enforcement. If LCCP strengthens monitoring and accountability and accelerates upgrading, marginal gains should be larger in high-secondary cities. Conversely, if a city’s air quality is driven primarily by transport/residential sources or by regional pollution transport beyond the scope of city governance, average effects may be modest. This framing is consistent with work linking LCCP to industrial structure upgrading and to synergistic carbon--pollution governance \citep{Zhong2024,Li2022}.

This heterogeneity perspective is consistent with two patterns documented in the broader LCCP literature. First, low-carbon pilots often affect multiple margins simultaneously---energy efficiency, innovation, and industrial upgrading---and these margins are more relevant in industrial settings \citep{Wang2023,Yang2023,Zhong2024}. Second, some studies explicitly frame LCCP as a ``synergistic governance'' instrument that can improve coordinated control of carbon and air pollution when abatement potential is high \citep{Li2022,Yang2023_3137}. Our analysis contributes by showing that this synergy is reflected in a transparent, welfare-relevant air-quality metric and by quantifying how strongly the average effect depends on baseline industrial exposure.

For \emph{Energy Policy}, an additional motivation for studying co-benefits is implementation feasibility. The co-benefits literature emphasizes that coordinated carbon--pollution control can deliver near-term local welfare improvements alongside longer-run climate goals \citep{Li2022,Yang2023_3137}. This logic also implies that evaluating co-benefits with long panels and credible diagnostics is important for policy learning, especially in pilot-based governance systems.

\subsection{Where this paper fits}

This paper is positioned at the intersection of three literatures that are central to \emph{Energy Policy}. First, it contributes to evidence on low-carbon pilot governance by focusing on a local-welfare outcome that is immediately policy-salient. Many studies emphasize carbon-related outcomes (intensity, efficiency, innovation) \citep{Zeng2023,Wang2023,Yang2023,Ma2021,Chen2021}; we complement these by quantifying air-quality co-benefits using a transparent outcome that policymakers routinely communicate (the fraction of ``good'' air-quality days). Second, it contributes to the empirical co-benefits literature by showing that effects are not uniform: the same pilot governance package is associated with much larger gains in industrial cities. Third, it contributes to policy evaluation practice in staggered-adoption settings by combining cohort-specific event studies with a set of robustness checks (reported in Appendix~A), and by illustrating how pre-trends can arise for other pilots (ETS) even when pooled DID estimates are positive.

Relative to existing air-quality studies of LCCP, our approach has two practical differences. First, many studies focus on specific pollutants (e.g., PM$_{2.5}$) or on short windows tied to post-2013 monitoring regimes \citep{Yan2021,Zhang2022,He2023}. We instead use a long panel and a stable city-year outcome constructed from daily AQI records, which allows us to assess dynamics and heterogeneity across multiple cohorts. Second, we explicitly address measurement completeness by screening city-years with insufficient monitoring days. This is a small design choice, but it improves interpretability: a yearly fraction is only meaningful when the underlying coverage approximates a full year.

The paper also helps clarify the relationship between low-carbon governance and other pilot-based instruments. The ETS comparison is not intended as a comprehensive evaluation of carbon trading; rather, it illustrates a general lesson: in pilot systems, selection and anticipatory behavior are plausible, and evaluation claims should be supported by dynamic diagnostics. Appendix~A reports overlap exclusions, alternative event windows, and a permutation test to provide a more conservative robustness picture.

The resulting policy insight is not merely whether ``LCCP works'' on average. Rather, the evidence points to \emph{targeted scaling}: larger estimated air-quality gains appear in more industrial cities, where abatement potential is plausibly higher and where implementation capacity may translate targets into upgrading and enforcement. This is a practical contribution for energy-transition governance, where policy resources are limited and where pilot programs are often proposed as scalable tools.

\subsection{Testable hypothesis}

\textbf{H1 (industrial exposure).} LCCP is associated with improved air quality on average, and estimated effects are larger in cities with higher pre-treatment secondary-industry shares.

\section{Data}

\subsection{Data sources and construction}

We assemble a prefecture-level city-year panel spanning 2000--2023 from four sources. First, daily city AQI records (2001--2024) are aggregated to city-year outcomes. We compute the annual number of monitored days (\texttt{days}), the number of ``good'' days (\texttt{good\_days}) defined by official air-quality grades ``excellent'' or ``good'', and the annual mean AQI (\texttt{aqi\_mean}). Second, city-level CO$_2$ emissions (tons) are matched by city code and year. Third, socioeconomic controls and industrial structure shares are drawn from a city statistical database (GDP, population, secondary- and tertiary-industry shares). Fourth, pilot adoption years are collected for LCCP cohorts (2010, 2012, 2017) and carbon-trading pilot cohorts (2013, 2014, 2016).

Our baseline analysis uses air-quality outcomes for 2001--2023. To reduce partial-year measurement artefacts (e.g., monitoring start or missing months), we restrict to city-years with at least 330 monitored days. This yields 3,765 observations across 294 cities.

\subsection{Data quality and outcome construction}

Air-quality monitoring coverage expands over time and may contain partial-year records when monitoring systems start or data are intermittently missing. Because our main outcome is a yearly fraction, partial coverage can mechanically change the denominator (\texttt{days}) and introduce spurious changes in the good-day share. The baseline threshold of 330 monitored days is therefore designed as a transparent quality screen that approximates full-year coverage while retaining a large sample. The results should be interpreted as applying to the ``well-measured'' city-years that pass this screen; the screen trades off sample size against measurement reliability.

\subsection{Variables}

\textbf{Outcome.} The main outcome is the annual good-day share:
\begin{equation}
  \text{GoodDayShare}_{it} = \frac{\text{good\_days}_{it}}{\text{days}_{it}},
\end{equation}
where \texttt{good\_days} counts days with official air-quality grades ``excellent'' or ``good''. This measure is bounded in $[0,1]$ and directly interpretable as the fraction of days in a year that meet commonly communicated air-quality standards.

We focus on the good-day share for two reasons. First, the ``excellent/good'' grading is widely communicated in policy practice and maps naturally into a welfare-relevant quantity (how often residents experience acceptable air quality). Second, it is available throughout our analysis period because it is constructed from daily AQI records and categorical grades, whereas consistent reporting of annual mean AQI is more limited in early years. In sensitivity checks, we also examine mean AQI in the post-2014 subsample and obtain qualitatively similar patterns (available upon request and in the replication package).

\textbf{Treatment.} For each policy, we define a city-specific pilot year. The post indicator $\text{Post}_{it}$ equals one from the pilot year onward, and the DID term equals $\text{Treat}_i \times \text{Post}_{it}$. For LCCP, there are 126 treated cities in our panel (72 in 2010, 26 in 2012, 28 in 2017). For carbon trading, there are 46 treated cities (24 in 2013, 13 in 2014, 9 in 2016).

\textbf{Industrial exposure.} Baseline secondary-industry share is measured as the average secondary-industry share (\%) over 2007--2009, preceding the first LCCP cohort. We split cities into high- and low-secondary groups by the median of this baseline measure.

\begin{table}[htbp]
  \centering
  \caption{Descriptive statistics (baseline sample: days$\ge$330).}
  \label{tab:desc}
  \begin{tabular}{lrrrr}
    \toprule
    Variable & Mean & SD & Min & Max \\
    \midrule
    Good-day share & 0.828 & 0.147 & 0.231 & 1.000 \\
    AQI mean & 76.047 & 20.236 & 33.850 & 175.344 \\
    log(CO$_2$/GDP) & 0.320 & 0.833 & $-2.607$ & 2.955 \\
    log(GDP) & 16.694 & 0.978 & 13.859 & 19.973 \\
    Secondary-industry share (\%) & 44.122 & 11.031 & 10.680 & 90.970 \\
    Population (10k) & 453.264 & 388.634 & 24.000 & 3209.000 \\
    \bottomrule
  \end{tabular}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: Summary statistics are computed on the analysis sample restricted to city-years with at least 330 monitored days. log(GDP) is the natural log of GDP in 10,000 yuan.
  \end{minipage}
\end{table}

\section{Empirical strategy}

\subsection{Baseline DID specification}

We first estimate a standard two-way fixed effects DID model:
\begin{equation}
  y_{it} = \alpha_i + \gamma_t + \beta\,(\text{LCCP}_i \times \text{Post}_{it}) + \varepsilon_{it},
  \label{eq:did}
\end{equation}
where $y_{it}$ is the good-day share, $\alpha_i$ are city fixed effects capturing time-invariant differences (e.g., geography and long-run industrial base), and $\gamma_t$ are year fixed effects capturing national shocks (e.g., macro cycles and nationwide environmental campaigns). Standard errors are clustered at the city level \citep{BertrandDufloMullainathan2004}. We interpret $\beta$ as the average post-adoption change in treated cities relative to their own pre-treatment levels and the contemporaneous change in control cities.

\subsection{Identification assumptions and diagnostics}

The DID interpretation relies on a parallel-trends assumption: absent LCCP, treated cities would have followed the same trend in air quality as comparable control cities after accounting for city and year fixed effects. In staggered designs, this assumption is cohort-specific, and conventional pooled estimators can be misleading under heterogeneous effects \citep{GoodmanBacon2021,CallawaySantAnna2021}. We therefore treat the lead coefficients in Equation~\eqref{eq:event} as an empirical diagnostic: if treated cohorts show systematic pre-treatment changes relative to not-yet-treated or never-treated units, a causal interpretation is less credible \citep{SunAbraham2021}. We emphasize joint tests of the lead coefficients (event time $\le -2$) as a compact summary, while noting that such tests are not definitive and can have limited power.

We also consider three common threats. First, \emph{anticipation}: cities may take actions before official adoption if pilot selection is known in advance. Second, \emph{policy overlap}: other contemporaneous pilots and campaigns may affect air quality. Third, \emph{spillovers}: if LCCP affects neighboring cities' air quality through regional transport or industrial relocation, the stable unit treatment value assumption (SUTVA) may be violated \citep{Li2024_4830}. Our design does not fully resolve these threats, but the long panel, dynamic profiles, and heterogeneity patterns provide informative evidence for policy discussion.

\subsection{Staggered adoption and cohort-specific event studies}

Because LCCP adoption is staggered, pooled TWFE event-study estimators can be biased when treatment effects vary over time or across cohorts \citep{GoodmanBacon2021,CallawaySantAnna2021}. We therefore implement cohort-specific event studies following \citet{SunAbraham2021}. Let $G_i$ denote the cohort (pilot year) for treated city $i$. We estimate:
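\begin{equation}
  y_{it} = \alpha_i + \gamma_t + \sum_{g \in \mathcal{G}} \sum_{k \neq -1} \beta_{g,k} \,\mathbb{1}[G_i=g]\,\mathbb{1}[t-G_i=k] + \varepsilon_{it},
  \label{eq:event}
\end{equation}
where $\mathcal{G}$ is the set of adoption cohorts, the cohort and event-time indicators equal zero for never-treated cities, and event time $k=-1$ is the omitted reference. We aggregate the cohort-specific coefficients $\beta_{g,k}$ into cohort-weighted estimates $\beta_k$ and report them with 95\% confidence intervals. As a diagnostic for differential pre-trends, we report a joint test of lead coefficients (event time $\le -2$).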

For LCCP, we use an event window of $k \in [-8,6]$ years relative to adoption, which balances dynamics with sample support in early cohorts. Estimates are aggregated across cohorts with weights proportional to cohort sizes, following the cohort-weighted construction in \citet{SunAbraham2021}. For carbon-trading pilots, the window is shorter because cohorts start later and sample support for long leads is limited.

\subsection{Heterogeneity by baseline industrial exposure}

To test H1, we split cities by the median baseline secondary-industry share (2007--2009) and estimate Equations~\eqref{eq:did} and~\eqref{eq:event} separately for high- and low-secondary groups. This split is chosen for transparency and policy interpretation (targeting), rather than as a structural mechanism decomposition.

\section{Results}

\subsection{Average effects on air quality}

LCCP pilots are associated with an improvement in annual air-quality performance. In our baseline specification (Table~\ref{tab:did_main}), we estimate an average increase in the annual good-day share of 0.0296 (SE 0.0120), i.e., about three percentage points. This is about 3.6\% of the sample mean of 0.828 and corresponds to roughly 11 additional ``good'' days in a 365-day year for an average city.

The size of this gain is also meaningful relative to the cross-city dispersion in Table~\ref{tab:desc} (SD of 0.147). In our baseline sample, the 25th percentile of the good-day share is about 0.75, whereas the median is about 0.86. A three-percentage-point increase therefore corresponds to moving a typical city a non-trivial distance in the distribution of annual air-quality performance. The effect is much larger in industrial cities (Table~\ref{tab:did_hetero}): the implied gain in those cities is approximately 0.05, i.e., about 19 additional good days per year.

For comparison, the carbon-trading estimate is also positive. However, we treat it as descriptive because the corresponding event-time profile exhibits strong pre-trends (Section~\ref{sec:ct_compare}). This contrast illustrates why pilot evaluation in staggered settings should be anchored in dynamic diagnostics rather than relying only on pooled ``after vs.\ before'' point estimates.

\begin{table}[htbp]
  \centering
  \caption{Baseline DID: pilot adoption and good-day share (days$\ge$330).}
  \label{tab:did_main}
  \begin{tabular}{lrrr}
    \toprule
    Specification & Coef. & SE (clustered) & $N$ \\
    \midrule
    Good-day share $\sim$ LCCP DID & 0.0296 & 0.0120 & 3760 \\
    Good-day share $\sim$ Carbon-trading DID & 0.0328 & 0.0172 & 3760 \\
    \bottomrule
  \end{tabular}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: City and year fixed effects; standard errors clustered by city.
  \end{minipage}
\end{table}

\subsection{Dynamic effects and pre-trends}

Figure~\ref{fig:lccp_event} traces changes in air quality around LCCP adoption. Improvements appear with a lag: estimated changes become positive around three years after adoption. This pattern is consistent with a policy package that requires time for planning, upgrading, and enforcement to translate into observed air-quality improvements \citep{Wang2015,Guo2022}. Appendix~A reports pre-trend diagnostics, which do not indicate strong differential pre-trends for LCCP in the baseline specification.

The dynamic profile also clarifies why average post indicators can be misleading in policy evaluation. If effects arrive with multi-year lags, short panels or short post windows will mechanically understate policy impacts, while evaluations that start after early implementation steps may overstate immediate effects. For LCCP, the lagged improvement is consistent with an implementation sequence in which cities first develop plans and compliance systems, then upgrade industrial processes and tighten enforcement, and only subsequently observe sustained changes in measured air quality.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\textwidth]{figures/Figure_1.png}
  \caption{LCCP adoption and good-day share: cohort-weighted event study (days$\ge$330).}
  \label{fig:lccp_event}
\end{figure}

Robustness checks (alternative monitoring thresholds, city trends, dropping municipalities, and a placebo adoption test) are reported in Appendix~A.

\subsection{Heterogeneity by baseline secondary-industry share}

Table~\ref{tab:did_hetero} and Figure~\ref{fig:hetero} show heterogeneity by baseline industrial exposure. In low-secondary cities, the estimated average effect is close to zero. In high-secondary cities, we estimate an increase in the good-day share of about 0.052 (SE 0.017), roughly five percentage points. The group-specific event studies mirror this pattern: improvements are concentrated in high-secondary cities and emerge after adoption with a lag.

This heterogeneity is consistent with a ``marginal abatement'' interpretation. Industrial cities have higher baseline emissions from power and industrial processes and therefore more scope for upgrading and enforcement to translate into measurable changes in air quality. The heterogeneity also helps reconcile mixed findings in the existing literature: samples with different compositions (industrial vs.\ service-oriented cities, coastal vs.\ inland regions) can yield different average effects even if the underlying policy mechanisms are similar \citep{Yan2021,Zhang2022,He2023,Li2024_4830}. For policy, the results suggest that targeting may matter: if pilot resources are limited, prioritising cities with higher industrial exposure may yield larger and more visible local co-benefits.

This evidence is consistent with H1 and suggests that estimated air-quality gains are larger where industrial pollution loads and abatement opportunities are greater. From a policy design perspective, it implies that scaling low-carbon pilot governance uniformly across cities may underdeliver compared to approaches that prioritise industrial cities and pair targets with implementation resources.

\begin{table}[htbp]
  \centering
  \caption{Heterogeneity: baseline secondary-industry share split (days$\ge$330).}
  \label{tab:did_hetero}
  \begin{tabular}{lrrr}
    \toprule
    Group & Coef. (LCCP DID) & SE (clustered) & $N$ \\
    \midrule
    Low-secondary cities & 0.0010 & 0.0129 & 1835 \\
    High-secondary cities & 0.0521 & 0.0172 & 1854 \\
    \bottomrule
  \end{tabular}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: Cities split by median baseline secondary-industry share (2007--2009).
  \end{minipage}
\end{table}

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\textwidth]{figures/Figure_2a.png}\\[2mm]
  \includegraphics[width=0.9\textwidth]{figures/Figure_2b.png}
  \caption{Heterogeneity by baseline secondary-industry share: low-secondary (top) vs.\ high-secondary (bottom).}
  \label{fig:hetero}
\end{figure}

\subsection{Contrast: carbon-trading pilots and identification challenges}
\label{sec:ct_compare}

Figure~\ref{fig:ct_event} reports the cohort-weighted event study for carbon-trading pilots. Despite a positive pooled DID estimate, the event-time profile shows pronounced pre-trends (lead joint test $p<0.01$; Table~\ref{tab:appendix_leads}). This pattern is consistent with selective placement, anticipatory action, or differential trends in pilot cities. It cautions against interpreting the carbon-trading DID estimate as causal without a stronger research design. This diagnostic perspective complements existing evidence linking carbon-trading pilots to air-quality outcomes \citep{AlmondZhang2021CarbonTradingAirQuality,Weng2022,Liu2021,Shi2022}.

For policy, the comparison is useful even if it is not the main focus of the paper. Carbon trading is a market-based instrument whose effectiveness depends on allowance allocation, monitoring, enforcement, and market liquidity. Pilot jurisdictions may also have stronger baseline environmental governance or stronger incentives to improve air quality, which can generate pre-trends \citep{Weng2022,Shi2022}. Evaluations that do not address such selection can overstate the causal impact of carbon trading or conflate policy impacts with underlying governance differences. A practical implication is that pilot evaluations should routinely report event-time diagnostics and, where feasible, use designs that better isolate plausibly exogenous variation \citep{AlmondZhang2021CarbonTradingAirQuality}.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.9\textwidth]{figures/Figure_3.png}
  \caption{Carbon-trading pilot and good-day share: cohort-weighted event study (days$\ge$330).}
  \label{fig:ct_event}
\end{figure}

\section{Discussion}

\subsection{Why are benefits concentrated in industrial cities?}

The heterogeneity pattern suggests that LCCP is associated with larger air-quality gains where baseline industrial exposure is greater. A plausible interpretation is that LCCP governance (targets, monitoring, and accountability) unlocks larger ``low-hanging fruit'' in industrial cities: energy-efficiency retrofits, process upgrading, and stricter enforcement of local pollution controls. This interpretation is consistent with evidence linking LCCP to improvements in energy efficiency and lower energy use in industrial settings \citep{Wang2023,Yang2023} and to green innovation and upgrading \citep{Ma2021,Chen2022,Zhong2024}.

At the same time, near-zero average effects in low-secondary cities do not imply that low-carbon transition is ineffective there. Rather, they may indicate that (i) baseline local air pollution is less tied to industrial sources and more driven by transport/residential sources; (ii) co-benefits depend on complementary measures (e.g., transport policy); or (iii) local air quality is strongly affected by regional pollution transport and spillovers beyond city governance \citep{Li2024_4830}. For policy, this underscores that low-carbon pilots may need to be tailored: governance packages that focus on industrial upgrading may yield the most visible air-quality gains where industrial exposure is high.

\subsection{Policy targeting, scaling, and policy mixes}

The empirical patterns point to practical design choices for scaling low-carbon pilots. Targeting industrial cities is one lever, but targeting alone may be insufficient if local capacity is weak. For high-secondary cities, policy packages that combine clear targets with monitoring, enforcement, and financing for upgrading may be more likely to translate into measurable air-quality improvements. This is consistent with firm- and city-level evidence that low-carbon pilots can affect upgrading and green innovation \citep{Wang2022_9002,Ma2021,Chen2022,Liu2023} and with arguments that policy mixes---rather than isolated instruments---often drive energy-transition outcomes \citep{Wu2023}.

For low-secondary cities, our near-zero average effects suggest that air-quality co-benefits may require different bundles. In such cities, the most binding constraints may lie in transportation, residential energy use, or regional pollution transport. Low-carbon governance could therefore be paired with transport electrification, building retrofits, or regional coordination mechanisms. More generally, pilot programs may work best when they are embedded in a coherent sequencing of policies (e.g., combining pilots on energy-use rights, carbon trading, and low-carbon city governance) \citep{Zhang2025,Du2023}.

\subsection{Implementation design: what should ``capacity'' mean in practice?}

For implementation, the results highlight that the ``implementation capacity'' requirement is not merely conceptual: the estimated lag of about three years is consistent with an implementation pipeline involving planning, investment, and enforcement. For industrial cities, three design elements appear particularly relevant.

\textbf{First, measurement and monitoring.} Cities need high-frequency and credible monitoring of both carbon-related indicators (energy use and emissions accounting) and local air-quality indicators. Monitoring is not only about data collection; it also supports enforcement by making non-compliance detectable. The data-quality screen we apply (days$\ge$330) highlights a practical lesson: without consistent monitoring coverage, evaluation and accountability can be undermined \citep{Guo2022}.

\textbf{Second, incentives and accountability.} Pilot governance typically relies on targets and performance evaluation. Clear responsibility assignment across agencies (environment, industry, energy, and finance) can reduce coordination failures. Evidence on LCCP implementation emphasizes that local execution varies and that policy implementation quality is an important determinant of outcomes \citep{Guo2022}. Aligning promotion incentives with verifiable outcomes and avoiding excessive short-term ``campaign'' styles can help sustain improvements.

\textbf{Third, enabling upgrading.} Industrial cities often require financing and technical support for retrofits, cleaner production, and substitution toward less polluting energy sources. Green finance and innovation-oriented pilot programs are potentially complementary in this regard \citep{Zhang2023,Gao2024}. Policy mixes should therefore be designed to relax constraints on adoption of energy-efficient and cleaner technologies rather than relying solely on mandates.

Operationally, these design elements can be translated into a small set of trackable indicators that support both implementation management and evaluation. Examples include (i) monitoring completeness (days of valid air-quality monitoring per year and coverage of major emission sources), (ii) enforcement intensity (inspection frequency, penalties, and compliance rates), (iii) upgrading progress (retrofit completion rates, closure of obsolete capacity, and energy-efficiency improvements), and (iv) innovation and diffusion signals (green patenting and adoption of low-carbon technologies) \citep{Ma2021,Chen2022,Liu2023}. Embedding such indicators into pilot governance can help distinguish between ``policy announced'' and ``policy implemented'' and can support mid-course corrections before outcome indicators (air quality) fully respond.

These elements suggest that scaling low-carbon pilots may need to be coupled with capacity-building resources and explicit, measurable intermediate milestones (e.g., monitoring completeness, retrofit completion rates, and enforcement actions), in addition to outcome targets.

\subsection{Implementation salience: an exploratory proxy}

Because LCCP is a package, quantifying ``implementation intensity'' is challenging. As a descriptive complement, we construct a proxy for local low-carbon agenda salience from prefecture-level government WeChat posts (2013--2023) by counting low-carbon-related keywords. In our data, LCCP adoption is associated with a modest increase in low-carbon-related posting intensity, but estimates are imprecise and do not support a strong communication channel. We therefore treat this proxy as an indicator of policy attention rather than evidence of causal mediation, consistent with the broader point that implementation quality varies across cities \citep{Guo2022}.

\subsection{Co-benefits, incentives, and political economy}

The co-benefit framing has an important governance implication: policies that are nominally designed for carbon mitigation may be implemented more forcefully when they align with short-term local objectives. Air quality is a local and highly salient outcome, and improvements can generate immediate political and social returns. This helps explain why industrial cities---where baseline pollution exposure is higher---may have both greater technical abatement potential and stronger incentives to implement upgrading and enforcement, consistent with the pilot-governance perspective in the LCCP policy and implementation literature \citep{Wang2015,Guo2022}. Conversely, in cities where local air pollution is less industrial in origin or is strongly driven by regional pollution transport, the same pilot governance package may not translate into a visible improvement, potentially weakening incentives for sustained implementation.

This political economy perspective strengthens the policy relevance of heterogeneity results. For program design, it suggests that pilots may be most effective when targeted to high-abatement-potential cities and paired with monitoring and milestone systems that maintain incentives when outcomes are delayed. It also suggests that communicating co-benefits should be done carefully: overstating immediate improvements risks undermining credibility, whereas emphasizing realistic lags and intermediate milestones can sustain support.

\subsection{Regional coordination and spillovers}

Air quality is inherently spatial: pollutants travel across administrative boundaries, and industrial relocation can shift emissions across jurisdictions. These features imply two policy considerations. First, city-focused pilot programs may need regional coordination mechanisms to fully realise co-benefits, especially in densely connected urban clusters. Second, evaluation results may understate or misattribute impacts if spillovers are large. Our city-year design does not model spatial spillovers explicitly, so the estimates should be interpreted as net local changes rather than as a full accounting of regional welfare effects. Future work could combine pilot adoption with atmospheric transport models or spatial econometric designs to quantify spillovers and to study whether low-carbon pilots shift pollution to neighboring jurisdictions \citep{Li2024_4830}.

\subsection{Implications for evaluation practice}

Beyond the substantive findings, the analysis illustrates two evaluation practices that are important for energy-policy learning. First, staggered-adoption pilots should be evaluated with methods that are robust to treatment-effect heterogeneity and should report dynamic diagnostics \citep{GoodmanBacon2021,CallawaySantAnna2021,SunAbraham2021}. Event-time profiles help clarify timing (lags) and whether pre-trends undermine causal interpretation. Second, policy overlap should be explicitly examined. Our ETS comparison shows how pre-trends can arise even when pooled DID estimates are positive, and Appendix~A demonstrates that the LCCP estimate remains positive after excluding ETS overlap. For policy institutions that rely on pilots as learning devices, building these diagnostics into routine evaluation protocols can reduce the risk of overclaiming.

\subsection{Limitations and scope for future research}

Three limitations are important for policy interpretation. First, while event-time lead tests reduce concern about differential pre-trends for LCCP in our baseline specification, pilot placement may still correlate with unobserved time-varying factors (e.g., concurrent local campaigns or evolving capacity). Second, the good-day share is an interpretable composite measure but does not reveal which pollutants drive improvements. Future work could connect LCCP to source-specific emissions and to health outcomes. Third, China's policy environment involves overlapping instruments (e.g., carbon trading, green finance, innovative-city pilots) that may interact \citep{Li2024,Zhang2023,Gao2024,Wang2023_4383}. Understanding policy mixes and sequencing is a key agenda for energy and environmental governance \citep{Wu2023}.

A further limitation is that our city-year design abstracts from within-city distributional effects. Air-quality improvements may not be evenly distributed across neighborhoods, and the welfare gains depend on population exposure and baseline health conditions. Related evidence suggests that low-carbon pilots can affect socioeconomic outcomes such as labor earnings and population dynamics \citep{Li2025_2762,Chen2023}. Future work could link pilot adoption to health and distributional outcomes and examine whether low-carbon governance produces regressive or progressive effects across groups. For transparency and cumulative policy learning, we provide a replication package including code, documentation, and derived analysis panels, subject to third-party licensing constraints on upstream sources (see the Data availability statement).

\section{Conclusions and policy implications}

We evaluate the air-quality co-benefits of China's Low-carbon City Pilot (LCCP) program using a long prefecture-level city-year panel and a transparent outcome: the annual share of ``good'' days (official grades ``excellent'' or ``good'') from daily AQI records. In a quality-controlled sample (at least 330 monitored days per year), we estimate an average increase in the good-day share of about 0.03 (three percentage points), with improvements emerging after a lag of roughly three years.

\textbf{Take-home message:} low-carbon city pilots can deliver cleaner air, but benefits arrive with multi-year lags and are concentrated in industrial cities.

\textbf{Where the benefits are.} The estimated effect is close to zero for low-secondary cities but about 0.05 (five percentage points) for high-secondary cities. This pattern is consistent with larger marginal abatement opportunities in industrial settings and implies that average impacts mask large differences in local welfare returns.

\textbf{Policy implications.} The results are consistent with \emph{targeted scaling} and \emph{capacity-backed implementation}. If near-term welfare gains through cleaner air are an objective, prioritisation may be most effective in industrial cities and high-abatement-potential sectors. Because improvements arrive with multi-year lags, programs may need to be designed and evaluated over realistic implementation horizons and tracked with intermediate milestones.

\textbf{Implementation considerations.}
\begin{enumerate}
  \item \textbf{Targeting:} consider prioritising cities with higher baseline industrial exposure when pilot resources are limited.
  \item \textbf{Monitoring:} aim for consistent air-quality and energy/emissions monitoring coverage to support accountability.
  \item \textbf{Upgrading support:} consider pairing targets with financing and technical assistance for retrofits and cleaner production.
  \item \textbf{Governance:} clarify and align responsibilities across agencies and track measurable intermediate milestones.
\end{enumerate}

Finally, the ETS comparison reinforces an evaluation lesson for energy policy: even when pooled point estimates are positive, pronounced pre-trends can signal selection or anticipation. Routine event-time diagnostics and staggered-adoption methods improve interpretability and reduce the risk of overclaiming \citep{SunAbraham2021}.

\section*{CRediT authorship contribution statement}
Zhilong Zhao: Conceptualization, Methodology, Formal analysis, Writing--original draft, Writing--review \& editing, Visualization.

\section*{Declaration of generative AI and AI-assisted technologies in the manuscript preparation process}
During the preparation of this manuscript, the author(s) used a generative AI tool (ChatGPT, OpenAI) for limited language editing (grammar and clarity) only. The tool was not used to generate research ideas, study design, analysis code, statistical analyses, results, or conclusions. The author(s) reviewed and edited all AI-assisted text and take full responsibility for the accuracy and integrity of the manuscript.

\section*{Funding}
The author(s) received no specific funding for this work.

\section*{Declaration of competing interest}
The author(s) declare that they have no competing interests.

\section*{Data availability}
Replication code, dependency list, variable dictionary, and derived city-year panels sufficient to reproduce all tables and figures are publicly available on Harvard Dataverse (doi:10.7910/DVN/FRDJZZ). Raw upstream data are not redistributed due to third-party licensing; the replication package provides access instructions and scripts to reconstruct the derived panels from local copies of the upstream sources.

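% The "References" heading is emitted by \bibliography itself (elsarticle/natbib); an explicit \section*{References} here would print it twice.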
\bibliographystyle{elsarticle-harv}
\bibliography{refs,refs_selected_unique}

\appendix
\setcounter{table}{0}
\setcounter{figure}{0}
\renewcommand{\theHtable}{\thesection.\arabic{table}}
\renewcommand{\theHfigure}{\thesection.\arabic{figure}}

\section{Robustness checks}

\subsection{Alternative samples and specifications}

Table~\ref{tab:appendix_robust} reports robustness checks for the main outcome. First, the estimated LCCP effect is stable across alternative monitoring thresholds (days$\ge$300 and days$\ge$350). Second, adding city-specific linear trends reduces the magnitude but retains a positive coefficient; this is consistent with gradual differential trends contributing to pooled estimates and motivates emphasis on event-time diagnostics. Third, excluding the four centrally administered municipalities yields a similar estimate to the baseline.

\begin{table}[htbp]
  \centering
  \small
  \caption{Robustness checks and complementary outcomes.}
  \label{tab:appendix_robust}
  \begin{tabularx}{\textwidth}{@{}Lrrr@{}}
    \toprule
    Specification & Coef. & SE & $N$ \\
    \midrule
    Good-day share $\sim$ LCCP DID (days$\ge$300) & 0.0319 & 0.0118 & 3795 \\
    Good-day share $\sim$ LCCP DID (days$\ge$330) & 0.0296 & 0.0120 & 3760 \\
    Good-day share $\sim$ LCCP DID (days$\ge$350) & 0.0276 & 0.0120 & 3685 \\
    Good-day share $\sim$ LCCP DID + city trends (HC1) & 0.0184 & 0.0082 & 3760 \\
    Good-day share $\sim$ LCCP DID (drop municipalities) & 0.0284 & 0.0121 & 3672 \\
    \midrule
    Good-day share $\sim$ ETS DID + city trends (HC1) & 0.0383 & 0.0158 & 3760 \\
    Good-day share $\sim$ ETS DID (drop municipalities) & 0.0300 & 0.0146 & 3672 \\
    \midrule
    log(CO$_2$/GDP) $\sim$ LCCP DID (TWFE) & -0.0015 & 0.0407 & 6858 \\
    log(CO$_2$/GDP) $\sim$ LCCP DID + city trends (HC1) & -0.0185 & 0.0142 & 6858 \\
    \bottomrule
  \end{tabularx}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: ``ETS'' denotes emissions trading scheme pilot cities. City-trend specifications include city-specific linear trends and use HC1 standard errors due to numerical instability of clustered SE with high-dimensional trend interactions.
  \end{minipage}
\end{table}

\subsection{Placebo adoption test}

To probe pre-trends and anticipation, we implement placebo adoption tests. For treated cities, we shift the LCCP pilot year earlier and estimate the DID model using only pre-treatment city-years (years strictly before the true adoption year). Placebo estimates are close to zero and statistically insignificant across shifts of 2--4 years (Table~\ref{tab:appendix_placebo}).

\begin{table}[htbp]
  \centering
  \small
  \caption{Placebo test for LCCP (pilot year shifted earlier).}
  \label{tab:appendix_placebo}
  \begin{tabularx}{\textwidth}{@{}Lrrr@{}}
    \toprule
    Specification & Coef. & SE (clustered) & $N$ \\
    \midrule
    Good-day share $\sim$ placebo LCCP DID (pilot$-2$, pre-treatment only) & -0.0041 & 0.0135 & 2515 \\
    Good-day share $\sim$ placebo LCCP DID (pilot$-3$, pre-treatment only) & -0.0070 & 0.0178 & 2515 \\
    Good-day share $\sim$ placebo LCCP DID (pilot$-4$, pre-treatment only) & -0.0132 & 0.0202 & 2515 \\
    \bottomrule
  \end{tabularx}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: Sample is restricted to city-years with days$\ge$330 and to pre-treatment years for treated cities (year $<$ true pilot year).
  \end{minipage}
\end{table}

\subsection{Lead-joint tests for event studies}

For transparency, Table~\ref{tab:appendix_leads} reports joint tests of event-study lead coefficients (event time $\le -2$). For LCCP, we do not reject the null hypothesis that the lead coefficients are jointly zero, i.e., no differential pre-trends, in the baseline specification ($p\approx 0.16$). For carbon-trading pilots, lead coefficients are strongly jointly significant ($p<0.01$), consistent with selective placement and/or anticipation.

\begin{table}[htbp]
  \centering
  \caption{Lead-joint tests for cohort-weighted event studies.}
  \label{tab:appendix_leads}
  \begin{tabular}{lrr}
    \toprule
    Policy & Lead joint test p-value & Lead window \\
    \midrule
    LCCP $\rightarrow$ good-day share & 0.16 & event time $\le -2$ \\
    ETS $\rightarrow$ good-day share & 0.00 & event time $\le -2$ \\
    \bottomrule
  \end{tabular}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: Event-study coefficients are estimated using the cohort-specific method of \citet{SunAbraham2021} with city and year fixed effects.
  \end{minipage}
\end{table}

\subsection{Policy overlap: excluding ETS pilot cities/periods}

Because China's pilot policy environment involves overlapping instruments, we examine whether the main LCCP estimate is sensitive to excluding cities and periods exposed to the ETS pilot. Table~\ref{tab:appendix_overlap} reports two variants: (i) dropping ETS pilot cities entirely, and (ii) keeping ETS pilot cities but dropping their post-ETS years (retaining pre-ETS observations). In both cases, the estimated LCCP effect remains positive and of similar magnitude.

\begin{table}[htbp]
  \centering
  \small
  \caption{LCCP DID excluding ETS overlap.}
  \label{tab:appendix_overlap}
  \begin{tabularx}{\textwidth}{@{}Lrrr@{}}
    \toprule
    Sample restriction & Coef. (LCCP DID) & SE (clustered) & $N$ \\
    \midrule
    Drop ETS pilot cities & 0.0329 & 0.0129 & 3138 \\
    Drop ETS post years (keep pre-ETS only) & 0.0272 & 0.0121 & 3324 \\
    \bottomrule
  \end{tabularx}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: Baseline outcome sample (days$\ge$330). ``Drop ETS post years'' removes observations with year $\ge$ city-specific ETS pilot year in ETS pilot cities.
  \end{minipage}
\end{table}

\subsection{Alternative event-study windows}

To assess sensitivity to the event window choice, we re-estimate the LCCP cohort-weighted event study using two alternative windows: $[-6,6]$ and $[-10,8]$. Figure~\ref{fig:appendix_window_short} and Figure~\ref{fig:appendix_window_long} show that the qualitative dynamic pattern---a lagged improvement after adoption---is preserved. Lead-joint tests remain non-rejecting ($p=0.35$ for $[-6,6]$ and $p=0.11$ for $[-10,8]$).

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.92\textwidth]{figures/Figure_A1.png}
  \caption{LCCP event study with alternative window $[-6,6]$ (days$\ge$330).}
  \label{fig:appendix_window_short}
\end{figure}

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.92\textwidth]{figures/Figure_A2.png}
  \caption{LCCP event study with alternative window $[-10,8]$ (days$\ge$330).}
  \label{fig:appendix_window_long}
\end{figure}

\subsection{Stacked DID estimate}

To complement cohort-weighted event studies, we estimate a stacked DID following a cohort-by-cohort stacking logic. For each adoption cohort $g \in \{2010, 2012, 2017\}$, we stack a cohort-specific sample containing (i) cities treated in cohort $g$ and (ii) never-treated cities as controls, restricting to an event window of $[-8,6]$ around adoption. We then estimate a DID with cohort-specific city and year fixed effects (city$\times$cohort and year$\times$cohort), clustering at the original city level to allow correlation across stacked replications. Table~\ref{tab:appendix_stacked} reports the stacked estimate, which is close to the baseline DID magnitude.

\begin{table}[htbp]
  \centering
  \small
  \caption{Stacked DID estimate for LCCP.}
  \label{tab:appendix_stacked}
  \begin{tabularx}{\textwidth}{@{}Lrrr@{}}
    \toprule
    Specification & Coef. & SE (clustered) & $N$ \\
    \midrule
    Good-day share $\sim$ LCCP DID (stacked; window $[-8,6]$) & 0.0319 & 0.0127 & 4728 \\
    \bottomrule
  \end{tabularx}
  \begin{minipage}{\textwidth}
    \footnotesize\raggedright Notes: Baseline outcome sample (days$\ge$330). Controls are never-treated cities; treated cities in other cohorts are excluded from each cohort-specific stack.
  \end{minipage}
\end{table}

\subsection{Permutation test (pilot-year reassignment)}

As a non-parametric complement, we implement a permutation test that randomly reassigns LCCP pilot years among treated cities (holding the set of treated cities and the cohort size distribution fixed) and recomputes the TWFE DID coefficient with city and year fixed effects. Figure~\ref{fig:appendix_perm} shows the distribution of placebo coefficients across 400 draws. The empirical two-sided p-value is approximately 0.10, indicating that the observed estimate is toward the upper tail of the permutation distribution.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.92\textwidth]{figures/Figure_A3.png}
  \caption{Permutation test: reassigning LCCP pilot years among treated cities (400 draws).}
  \label{fig:appendix_perm}
\end{figure}

\end{document}
